In [ ]:
import gensim
from gensim.models.word2vec import Word2Vec
from sklearn.manifold import TSNE
import re
import matplotlib
In [ ]:
# Select the TkAgg backend; this runs before pyplot is imported in a later cell.
# NOTE(review): the `%matplotlib inline` magic further down also sets a
# backend, so this call may be redundant inside a notebook - confirm.
matplotlib.use("TkAgg")
In [ ]:
import matplotlib.pyplot as plt
%matplotlib inline
In [ ]:
# Directory holding the input text files (relative path, trailing slash
# included because file names are appended by plain string concatenation).
data_dir = '../data/'
In [ ]:
# Path to the Macbeth text, later read line by line to build the corpus.
macbeth_file = data_dir + 'macbeth.txt'
In [ ]:
# Path to the Julius Caesar text, later read line by line to build the corpus.
caesar_file = data_dir + 'julius_caesar.txt'
In [ ]:
# Path to the stop-word list (read one entry per line).
stopword_file = data_dir + 'long_stopwords.txt'
In [ ]:
# Load the stop-word list, one entry per line, stripping the same
# non-alphanumeric characters that clean() strips from tokens, so an entry
# such as "they'll" matches the cleaned token "theyll". The newline at the end
# of each line is non-alphanumeric too, so the single substitution removes it.
#
# The result is materialised as a set: a lazy map() object (Python 3) would be
# exhausted by the first `in` test inside clean(), silently disabling stop-word
# filtering for every later word. A set also makes each lookup O(1).
with open(stopword_file, 'r') as inpFile:
    stop_words = {re.sub('[^A-Za-z0-9]+', '', entry) for entry in inpFile}
In [ ]:
# Display the loaded stop words as the cell output.
stop_words
In [ ]:
# Inspect which container type stop_words ended up as.
type(stop_words)
In [ ]:
def clean(word):
    """Normalise a raw token before it is added to a sentence.

    Surrounding whitespace is stripped, the token is lower-cased and every
    non-alphanumeric character is removed.

    Returns the normalised token, or '' when the normalised token is found in
    the module-level ``stop_words`` collection.
    """
    normalised = re.sub('[^A-Za-z0-9]+', '', word.strip().lower())
    return '' if normalised in stop_words else normalised
In [ ]:
# Spot check: the apostrophe is removed, so this yields 'kings' unless that
# token is in stop_words (in which case '' is returned).
clean("king's")
In [ ]:
# Spot check on a contraction: normalises to 'theyll', which returns '' if the
# stop-word list contains that (punctuation-stripped) entry.
clean("they'll")
In [ ]:
def _read_sentences(path):
    """Return one list of cleaned tokens per non-empty line of the file at *path*.

    Each whitespace-separated word is passed through clean(); words that
    clean() maps to '' are dropped. Lines left with no words (blank lines, or
    lines made up solely of stop words / punctuation) are skipped, because the
    original guard `line is not None or line != '\\n'` was always true and let
    empty sentences through.

    Every sentence is a real list rather than a lazy map/filter object, so the
    corpus can be iterated more than once.
    """
    result = []
    with open(path, 'r') as inpFile:
        for line in inpFile:
            words = [word for word in map(clean, line.split()) if word]
            if words:
                result.append(words)
    return result


# Never updated or read in the visible code; kept so the name still exists for
# any cell that may rely on it.
line_count = 0

# Corpus for Word2Vec: Macbeth sentences followed by Julius Caesar sentences.
sentences = _read_sentences(macbeth_file) + _read_sentences(caesar_file)
In [ ]:
# Inspect the container type of the corpus.
type(sentences)
In [ ]:
# Train word embeddings on the combined corpus: context window of 5 tokens,
# 4 worker threads, and words seen fewer than 5 times are ignored.
# NOTE(review): `size` is the pre-4.0 gensim keyword (renamed `vector_size` in
# gensim 4.0) - confirm against the installed gensim version.
model = Word2Vec(sentences, window=5, size=500, workers=4, min_count=5)
In [ ]:
# Display the vocabulary the model kept after training.
# NOTE(review): `model.vocab` is the pre-4.0 gensim attribute - confirm it
# exists in the installed gensim version.
model.vocab
In [ ]:
# Gather every vocabulary word and its vector, in matching order, as the
# input (tokens) and point labels (labels) for the t-SNE projection.
labels = list(model.vocab)
tokens = [model[word] for word in labels]
In [ ]:
# Configure a t-SNE projection to 2 components with PCA initialisation.
# NOTE(review): recent scikit-learn releases rename `n_iter` to `max_iter` -
# confirm against the installed version.
tsne_model = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000)
In [ ]:
# Project the word vectors; each row of the result corresponds to the word at
# the same position in `labels`.
new_values = tsne_model.fit_transform(tokens)
In [ ]:
# Split the projected points into separate x and y coordinate lists.
x = [point[0] for point in new_values]
y = [point[1] for point in new_values]
In [ ]:
# Plot each projected word as its own scatter point with its label offset
# slightly from the marker, then render the figure.
plt.figure(figsize=(16, 12))
for label, x_pos, y_pos in zip(labels, x, y):
    plt.scatter(x_pos, y_pos)
    plt.annotate(
        label,
        xy=(x_pos, y_pos),
        xytext=(5, 2),
        textcoords='offset points',
        ha='right',
        va='bottom',
    )
plt.show()
In [ ]:
# Analogy query: words nearest to caesar + duncan - scotland.
# NOTE(review): called on the model directly (pre-4.0 gensim style) - confirm
# against the installed gensim version.
model.most_similar(positive=['caesar','duncan'],negative=['scotland'])
In [ ]:
# Analogy query: words nearest to caesar + duncan - macbeth.
model.most_similar(positive=['caesar','duncan'],negative=['macbeth'])
In [ ]:
# Analogy query: words nearest to caesar + macbeth - banquo.
model.most_similar(positive=['caesar','macbeth'],negative=['banquo'])
In [ ]:
# Analogy query: words nearest to rome + scotland - banquo.
model.most_similar(positive=['rome','scotland'],negative=['banquo'])
In [ ]:
# Ask the model which of the four words fits least well with the others.
model.doesnt_match("duncan macbeth scotland banquo".split())
In [ ]: